Cross Referencing Endowment Values
endowment_data <- read_rds(here("data", "endowment_filter_data_990.RDS"))
companies_to_ein <- read_csv(here("data", "companies.csv")) %>%
mutate(EIN = as.character(ein))# make kable table with consistent formatting
make_table <- function(df, title = "", ...) {
title <- paste0("<center><span style = 'font-size:110%;color:black'><b>",
title,
"</span></b><center>")
as_tibble(df) %>%
kbl(caption = title, ... ) %>%
kable_material() %>%
row_spec(row=0, background = "#43494C" , color = "white", bold = TRUE)
}Notes on Strategy
We want to compare the current year variables CY to the
current year minus X years variables labelled CYX. To do
this, we can:
- structure the data so each company has all available years (but all NAs for years where they had no data)
- order by fiscal year
- subtract the lagged
CYvariable from theCYXvariable where the lag isXyears. For example, forCYM1we want to compare to theCYjust one year ago, so lagged one year.
In this way, we obtain a collection of differences between reports that should be in concordance but are not always.
# plot missingness for a given variable
# number of observations = number of observations
# where that EIN had that variable (not NA)
plot_missing <- function(variable) {
endowment_data %>%
group_by(EIN) %>%
# number of observations where variable is not NA
summarize(number_observations = sum(!is.na(!!sym(variable)))) %>%
group_by(number_observations) %>%
# number of EINs with each value of number_observations
summarize(n_ein=n()) %>%
ggplot(aes(x = number_observations, y =n_ein ))+
geom_bar(stat="identity") +
labs(y = "Number of Companies",
x = paste0("Number of Observations where\n",
variable, " was Not Missing"),
title =paste0("Missingness for ", variable)) +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = .5))
}
# compare values from CY to CYM* for given variable
# returns data frame that contains the difference between the CY value and
# corresonding CM* values
# for example, the difference between the CY value for 2016 would be compared
# to the CYM1 value for 2017, and the CYM2 value for 2018, and so on
check_variable <- function(variable_name,
data) {
base_name <- variable_name
var <- paste0("CY", base_name)
vars <- paste0("CYM", c( 1:4), base_name)
# plt <- plot_missing(var)
# print(plt)
eins_with_variable <- data %>%
group_by(EIN) %>%
summarize(number_observations = sum(!is.na(!!sym(var)))) %>%
filter(number_observations != 0) %>%
pull(EIN)
# the goal here is to create a row for each fiscal year, with NAs if
# there are no observations for that year
# this is needed so that we have consecutive years, which is important
# for substraction using lag() to work correctly
data <- data %>%
filter(EIN %in% eins_with_variable) %>%
select(EIN, fiscal_year, contains(base_name)) %>%
pivot_wider(names_from = fiscal_year,
# names_prefix = "fiscalyear",
values_from=contains(base_name)) %>%
pivot_longer(cols = contains(base_name),
names_to = "variable_year") %>%
separate(variable_year, sep = "_", into = c("variable_name", "fiscal_year")) %>%
pivot_wider(names_from = variable_name, values_from = value) %>%
mutate(fiscal_year = as.factor(as.numeric(fiscal_year)))
crossref <- data %>%
group_by(EIN) %>%
arrange(fiscal_year) %>%
# lag corresponds to how far back the current year comparison should be
# vars contains the CM* variables that represent reporting for years back
# compare these CM* variables to the lagged current year (CY) variables
mutate(
difference_in_reported_year1 = !!sym(vars[1]) -
lag(!!sym(var), n =1),
difference_in_reported_year2 = !!sym(vars[2]) -
lag(!!sym(var), n =2),
difference_in_reported_year3 = !!sym(vars[3]) -
lag(!!sym(var), n =3),
difference_in_reported_year4 = !!sym(vars[4]) -
lag(!!sym(var), n =4)
) %>%
ungroup()
}Cross Referencing Beginning Year Balance Amount
Comparison Across Years
As we might expect, we see that a higher proportion had a nonzero difference between the cross referenced reports for years further back in time. That is, reporting tended to be more accurate for most recent years.
crossref <- check_variable("BeginningYearBalanceAmt", data = endowment_data)
plot_missing("CYBeginningYearBalanceAmt")# plot fraction where there was a difference between
# the reports by year
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
filter(!is.na(value)) %>%
group_by(name) %>%
summarize(number_zeros = sum(ifelse(value == 0, 1,0)),
total_reports = n(),
fraction = 1-( number_zeros / total_reports)) %>%
mutate(name = gsub("difference_in_reported_year", "", name)) %>%
ggplot(aes(x=name, y = fraction)) +
geom_bar(stat ="identity", fill = "#234A77") +
geom_label(aes(label = round(fraction,2))) +
labs(title = paste0("Fraction of Differences that Were Nonzero\n",
"Between Cross Referenced Reports"),
subtitle = "By Year",
x = "Years Between Reports Compared",
y = "Fraction with Nonzero Difference") +
theme_bw() +
theme(plot.title = element_text(hjust = .5, face="bold"),
plot.subtitle = element_text(hjust = .5, face="italic"))We also see we have fewer total comparisons of reports as we go back further back in time, because we can’t compute the 4 year comparison for any date where we don’t have a value 4 years back.
# stacked chart, note we can't see how nonzero counts are changing
# relative to the total counts
crossref %>%
select(EIN, contains("difference"), fiscal_year) %>%
pivot_longer(cols = contains("difference")) %>%
filter(!is.na(value)) %>%
group_by(name) %>%
summarize(zero = sum(ifelse(value == 0, 1,0)),
nonzero = sum(ifelse(value == 0, 0,1))) %>%
# notice each row represents a fiscal_year-EIN-difference_type
pivot_longer(cols = c(zero, nonzero),
names_to = "source",
values_to = "count") %>%
mutate(name = gsub("difference_in_reported_year", "", name),
source = ifelse(source == "nonzero",
"Nonzero Difference",
"Zero Difference")) %>%
ggplot(aes(x=name, y = count, fill = source)) +
geom_bar(stat ="identity", position = "stack", alpha = .8) +
geom_label(aes(label = round(count,3), y = count, color = source),
position = "stack",
size = 2.6,
label.padding = unit(.1, "lines"),
fill = "white",
fontface="bold",
show.legend = FALSE) +
labs(title = "Number of Zero and Nonzero Differences\nBetween Cross Referenced Reports",
subtitle = "By Year",
x = "Years Between Reports Compared",
y = "Count",
fill = "") +
theme_bw() +
theme(plot.title = element_text(size = 16, hjust = .5, face="bold"),
plot.subtitle = element_text(hjust = .5, face="italic"),
axis.text.x = element_text(size = 13),
axis.title = element_text(size = 16, face = "bold"))Companies with Discordance in Reported Values
# difference represents What They Reported as CY Minus X Years - What They Reported at The Time
companies_different <- crossref %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0) %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
arrange(organization_name) %>%
pull(EIN) %>%
unique()
crossref %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0) %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
mutate(year = substr(name, nchar(name), nchar(name)),
year = paste0("Comparing Current<br> Year Minus ",
year)) %>%
arrange(organization_name) %>%
select(`Organization Name` = organization_name,
`Difference in Years` = year,
`Fiscal Year` = fiscal_year,
`Recent - Previously Reported` = value) %>%
make_table(title = paste0(
"Comparing Values Reported in More Recent Report to Those Previously Reported:<br>",
"<i>Number of Companies that have at Least One Report Not Concordant: </i>",
length(companies_different)),
digits = 3,
format.args = list(
big.mark = ",",
scientific = FALSE),
escape=FALSE,
booktabs=TRUE) %>%
scroll_box(height = "450px",
width = "100%") | Organization Name | Difference in Years | Fiscal Year | Recent - Previously Reported |
|---|---|---|---|
| Ballet Arizona |
Comparing Current Year Minus 1 |
2018 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2018 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2019 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2019 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2020 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 4 |
2020 | 500,000 |
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2018 | 26,128 |
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2019 | 13,343 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2019 | 148,799 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2020 | 13,343 |
| Fort Wayne Ballet |
Comparing Current Year Minus 3 |
2020 | 148,799 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 1 |
2019 | 3,000 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 2 |
2020 | 3,000 |
| San Francisco Ballet |
Comparing Current Year Minus 1 |
2017 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2017 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2018 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2018 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2019 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2019 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2020 | 107,033,401 |
| The Alabama Ballet |
Comparing Current Year Minus 1 |
2019 | 227,040 |
| The Alabama Ballet |
Comparing Current Year Minus 2 |
2020 | 227,040 |
| The Alabama Ballet |
Comparing Current Year Minus 3 |
2020 | 219,787 |
| The Alabama Ballet |
Comparing Current Year Minus 4 |
2020 | 254,152 |
We see that values are repeated because if there is some value that is quite off, say for 2016, then this shows up in the CYM1 for 2017, but also CYM2 for 2018, CYM3 for 2019 and so on.
Tables of Reported Values for Each Company with Discordance in Reported Values
Interpretation:
- The easiest way to interpret the company-specific tables is to look
diagonally left-to right. For example, 2018
CYshould match 2019CYM1, 2020CYM2, and 2021CYM3(though the 2021 values often are NA at this time).
Observations:
- We see in some cases, the problematic reports are clear initially. This is the case in San Francisco Ballet, Ballet Arizona, or the Alabama Ballet.
- The differences for Fort Wayne Ballet and the Pacific Northwest Ballet are more subtle.
# iterate through EINs where there was discordance and
# generate a table so we can better see what's going on
variable_name <- "BeginningYearBalanceAmt"
walk(1:length(companies_different), ~{
name <- companies_to_ein %>%
filter(EIN == companies_different[.x]) %>%
pull(organization_name)
table <- crossref %>%
rename_with(cols=everything(), ~gsub(variable_name, "", .)) %>%
filter(EIN %in% companies_different[.x]) %>%
select(-c(EIN, contains("difference"))) %>%
make_table(title = paste0("Reports for ",
name, "<br>EIN: ",
companies_different[.x],
", Variable: ", variable_name))
print(table)
# print(table)
})crossref %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
# filter(value > 0) %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
mutate(year = substr(name, nchar(name), nchar(name)),
year = paste0("Comparing Current Year Minus ",
year)) %>%
arrange(organization_name) %>% View()Cross Referencing All Endowment Variables
Missingness by Variable
variables_to_check <- endowment_data %>%
select(contains("CY")) %>%
colnames() %>%
gsub("CY|CYM.", "",.) %>%
unique()
crossref_all <- map_df(
variables_to_check,
~{ variable_name <- .x
check_variable(variable_name,
data = endowment_data) %>%
# remove variable name part of column name
# so we can bind rows together, add this information
# as a separate column
rename_with(cols=everything(),
~gsub(variable_name, "", .)) %>%
mutate(variable = .x)
})
missing_all <- map_df( variables_to_check,
~ {variable <- paste0("CY",.x)
endowment_data %>%
group_by(EIN) %>%
summarize(number_observations = sum(!is.na(!!sym(variable)))) %>%
group_by(number_observations) %>%
summarize(number_eins=n()) %>%
mutate(variable = variable)
})colors <- c("#58b5e1", "#49406e", "#9dd84e", "#6633b4", "#46ebdc")
missing_all %>%
mutate(number_observations = paste0(
"Number of EINS with ",
number_observations,
" Observations for this Variable" )) %>%
ggplot(aes(x = variable, y =number_eins, fill = variable))+
geom_bar(stat="identity",
position = "dodge",
show.legend=FALSE) +
geom_label(aes(label = number_eins,
color = variable),
fill = "white",
vjust = .5,
size = 2,
position = position_dodge(1),
label.padding = unit(.1, "lines"),
show.legend=FALSE) +
facet_wrap(~number_observations, ncol=1) +
coord_flip() +
labs(y = "Number of Companies",
x = "Variable Name",
title = "Comparing Missingness Across Variables") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = .5),
axis.title = element_text(face = "bold")) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
scale_y_continuous(n.breaks = 8) Fraction Discordant by Variable
# plot fraction discordant for each variable
crossref_all %>%
select(EIN, contains("difference"), variable) %>%
pivot_longer( contains("difference")) %>%
filter(!is.na(value)) %>%
group_by(variable) %>%
summarize(
number_of_discordant_observations = sum(value > 1),
total_observations_of_variable = n(),
fraction_discordant = number_of_discordant_observations / total_observations_of_variable) %>%
ggplot(aes(x = fct_reorder(variable,
fraction_discordant,
.desc = TRUE),
y = fraction_discordant)) +
geom_bar(stat="identity",
fill = "#234A77")+
geom_label(aes(label = round(fraction_discordant, 3))) +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = .5, size = 16),
axis.title = element_text(face = "bold", size =16),
axis.text.x = element_text(size = 12, angle = 10, vjust = .6)) +
labs(y = "Fraction Discordant",
x = "Endowment Variable",
title = "Fraction of Observations that Were Discordant for Each Variable")# generate table displaying the discordant values for a given variable
get_discordant_table <- function(variable_name, data) {
# observations with nonzero difference
cross_ref_for_var <- data %>%
filter(variable == variable_name) %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0)
# EINs that have at least one discordance
discordant <- cross_ref_for_var %>%
pull(EIN) %>% unique()
# generate table displaying discordances
cross_ref_for_var %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
mutate(year = substr(name, nchar(name), nchar(name)),
year = paste0("Comparing Current<br> Year Minus ",
year)) %>%
arrange(organization_name) %>%
select(`Organization Name` = organization_name,
`Difference in Years` = year,
`Fiscal Year` = fiscal_year,
`Recent - Previously Reported` = value) %>%
make_table(title = paste0("Variable: ",
variable_name,
"<br>Comparing Values Reported in More Recent Report to Those Previously Reported:<br>",
"<i>Number of Companies that have at Least One Report Not Concordant: </i>",
length(discordant)),
digits = 3,
format.args = list(
big.mark = ",",
scientific = FALSE),
escape=FALSE,
booktabs=TRUE) %>%
scroll_box(height = "450px",
width = "100%")
}
# iterate over all variables to check and generate table
walk(variables_to_check, ~{
table_for_var <- get_discordant_table(.x, data = crossref_all)
print(table_for_var)
})Companies with Discordant Reporting for at Least One Variable
# variables corresponding to number of companies with at least one discordance
crossref_all %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value, variable) %>%
filter(value > 0) %>%
group_by(EIN) %>%
summarize(
number_variables = length(unique(variable)),
variable = paste(unique(variable), collapse=",<br>")) %>%
left_join(companies_to_ein) %>%
arrange(organization_name) %>%
select(`Organization Name` = `organization_name`,
`Number of Variables Discordant` = number_variables,
`Variables with Discordant Reporting` = variable) %>%
make_table(
title = "Companies with Discordant Reporting for at Least One Variable",
escape=FALSE)| Organization Name | Number of Variables Discordant | Variables with Discordant Reporting |
|---|---|---|
| Ballet Arizona | 2 |
BeginningYearBalanceAmt, EndYearBalanceAmt |
| Fort Wayne Ballet | 4 |
BeginningYearBalanceAmt, InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
| Joffrey Ballet | 2 |
ContributionsAmt, EndYearBalanceAmt |
| Pacific Northwest Ballet | 2 |
BeginningYearBalanceAmt, EndYearBalanceAmt |
| Pittsburgh Ballet Theatre | 3 |
InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
| San Francisco Ballet | 5 |
BeginningYearBalanceAmt, ContributionsAmt, InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
| Texas Ballet Theater | 1 | EndYearBalanceAmt |
| The Alabama Ballet | 4 |
BeginningYearBalanceAmt, InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
# for each variable, list of EINs that have at least one discordance
intersections <- crossref_all %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value, variable) %>%
filter(value > 0) %>%
group_by(variable) %>%
summarize(EINs = list(unique(EIN)))
discord_in_all <- Reduce(intersect, intersections$EINs) %>% unique() %>% length()
discord_at_least_one <- Reduce(union, intersections$EINs) %>% unique() %>% length() The number of companies with a discordant report for all variables was 1, and the number of companies with at least one discordant report for all variables was 8.
# visualize discordances in given variable_name
plot_reported_for_variable <- function(variable_name, crossref, endowment) {
cross_ref_for_var <- crossref %>%
filter(variable == variable_name) %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0)
discordant <- cross_ref_for_var %>%
pull(EIN) %>% unique()
number_cols <- ifelse(length(discordant) <= 6, 1,2)
# plot the values for the year they correspond to so we can compare,
# for example, if CM1 for 2016 is the same as CY for 2015
endowment %>%
filter(EIN %in% discordant) %>%
select(EIN, fiscal_year, contains(variable_name)) %>%
group_by(EIN) %>%
arrange(fiscal_year) %>%
pivot_longer(3: ncol(.)) %>%
mutate(source = ifelse(grepl("CYM", name), substr(name, 1,4), "CY"),
year_lag = ifelse(grepl("CYM", name), substr(source, 4,4), 0),
year_lag = as.numeric(year_lag),
fiscal_year = as.integer(paste0(fiscal_year))) %>%
mutate(value_year = fiscal_year -year_lag
) %>%
left_join(companies_to_ein) %>%
mutate(organization_name = paste0(organization_name,
" (EIN: ", EIN, ")")) %>%
ggplot(aes(x = value_year, y = value)) +
geom_jitter(aes(fill=source), height =0,
width = .2,
alpha = .8,
size = 2.2,
shape =21,
color = "black",
stroke =.4) +
# geom_line(aes(group = source, color = source)) +
facet_wrap(~organization_name, scales= 'free_y', ncol = number_cols) +
scale_x_continuous(breaks = 2011:2021 ) +
scale_y_continuous(labels = comma) +
viridis::scale_fill_viridis(option="magma", discrete=TRUE) +
theme_bw() +
labs(x = "Fiscal Year",
y = "Reported Value (Dollars)",
title = paste0("Comparing Reported Values for ", variable_name),
subtitle = "Only Considering Companies with at Least One Discordant Value") +
theme(plot.title = element_text(
face = "bold",
hjust = .5,
size = 16),
axis.title = element_text(face = "bold", size =16),
axis.text = element_text(size = 12),
strip.text = element_text(face = "bold", size = 14),
plot.subtitle=element_text(size =14,
face="italic",
hjust = .5),
legend.text = element_text(size = 10),
legend.title = element_text(face = "bold", size = 12)) +
guides(legend = guide_legend(override.aes = list(size = 3)))
}
# plot variables by year, by variable only for EINs that have
# at least one discordance for a given variable
walk(unique(variables_to_check),
~ {plt <- plot_reported_for_variable(
variable_name = .x,
crossref = crossref_all,
endowment = endowment_data)
print(plt) })Questions to Consider
- Should we assume the most recently reported values are (the most) accurate?
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
group_by(name) %>%
mutate(count_na = sum(is.na(value)),
count_not_na = sum(!is.na(value))) %>%
ungroup() %>%
ggplot(aes(x = value)) +
# geom_boxplot() +
geom_histogram(bins = 50)
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
group_by(name) %>%
mutate(count_na = sum(is.na(value)),
count_not_na = sum(!is.na(value))) %>%
ungroup() %>%
ggplot(aes(x = name, y=value)) +
geom_boxplot() +
geom_jitter(alpha = .5, height = 0, width = .01)
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
group_by(name) %>%
mutate(count_na = sum(is.na(value)),
count_not_na = sum(!is.na(value))) %>%
ungroup() %>%
ggplot(aes(x = name, y = value)) +
# geom_boxplot() +
geom_density()
geom_histogram() +
facet_wrap(~name) +
scale_x_log10()